suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Directory holding the input data for this project.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

# Project root. Every path below is built from it, and it is also made the
# working directory.
wd <- "/Users/s-mitsutomi/My Drive (shuheimitsutomi@ric.u-tokyo.ac.jp)/Analysis/METTL2A/"
setwd(wd)

# Output folders for tables and figures (wd already ends with '/').
tabledir <- str_c(wd, 'Tables/DRS/Expression_m3C/')
figdir <- str_c(wd, 'Figures/DRS_m3C_expression/')

# Default ggplot2 theme for all plots: classic look, 7 pt base size, legend
# placed below the panel.
theme_set(theme_classic(base_size = 7) + theme(legend.position = 'bottom'))

Functions

# espresso_deseq2 <- 
#   read_tsv(
#     paste0(wd, 'Tables/Espresso/espresso_deseq2_genetype2_isDET_2024-04-18.tsv')
#   )
# espresso_deseq2

# Load the per-transcript, per-sample CPM table (gzipped TSV under the project
# root) and print a preview.
drs_cpm <- 
  paste0(wd, 'Tables/DRS_quantification/espresso_quantification_cpm_2024-04-19.tsv.gz') |> 
  read_tsv()
## Rows: 330453 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (14): transcript_id, transcript_name, gene_id, type, si, seqname, source...
## dbl  (6): rep, count, total_reads, cpm, start, end
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
drs_cpm
## # A tibble: 330,453 × 20
##    transcript_id     transcript_name gene_id type  si      rep count total_reads
##    <chr>             <chr>           <chr>   <chr> <chr> <dbl> <dbl>       <dbl>
##  1 ENST00000498442.1 CRBN-212        ENSG00… siME… I         1  0        3552783
##  2 ENST00000498442.1 CRBN-212        ENSG00… siME… I         2  1         997879
##  3 ENST00000498442.1 CRBN-212        ENSG00… siME… I         3  0        2778705
##  4 ENST00000498442.1 CRBN-212        ENSG00… siME… G         1  0        3497396
##  5 ENST00000498442.1 CRBN-212        ENSG00… siME… G         2  0        3810844
##  6 ENST00000498442.1 CRBN-212        ENSG00… siME… G         3  0        3668094
##  7 ENST00000498442.1 CRBN-212        ENSG00… Cont  D         1  1        2701773
##  8 ENST00000498442.1 CRBN-212        ENSG00… Cont  D         2  1        3406597
##  9 ENST00000498442.1 CRBN-212        ENSG00… Cont  D         3  0        3653792
## 10 ENST00000459840.5 CRBN-205        ENSG00… siME… I         1  1.08     3552783
## # ℹ 330,443 more rows
## # ℹ 12 more variables: cpm <dbl>, seqname <chr>, source <chr>, feature <chr>,
## #   start <dbl>, end <dbl>, score <chr>, strand <chr>, frame <chr>,
## #   gene_type <chr>, gene_name <chr>, transcript_type <chr>
# Load the table of positions listed in the "common_sig_seqs_in_intensity_up"
# file (gzipped TSV under the project root) and print a preview.
common_intensity_up_positions <- 
  paste0(wd, 'Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-10.tsv.gz') |> 
  read_tsv()
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
common_intensity_up_positions
## # A tibble: 605 × 65
##    transcript_id     transcript_name position ref_kmer GMM_logit_pvalue_G
##    <chr>             <chr>              <dbl> <chr>                 <dbl>
##  1 ENST00000429711.7 RPL32-204            422 GCCCA                 1    
##  2 ENST00000647248.2 RPL35A-211           380 ACCCC                 1    
##  3 ENST00000647248.2 RPL35A-211           381 CCCCT                 1    
##  4 ENST00000389680.2 MT-RNR1-201           43 ACACA                 1    
##  5 ENST00000389680.2 MT-RNR1-201           57 CCCCG                 1    
##  6 ENST00000389680.2 MT-RNR1-201           71 GTTCA                 1    
##  7 ENST00000389680.2 MT-RNR1-201           73 TCACC                 1    
##  8 ENST00000389680.2 MT-RNR1-201           75 ACCCT                 0.777
##  9 ENST00000389680.2 MT-RNR1-201           93 ATCAA                 1    
## 10 ENST00000389680.2 MT-RNR1-201          138 GCTTA                 1    
## # ℹ 595 more rows
## # ℹ 60 more variables: KS_dwell_pvalue_G <dbl>, KS_intensity_pvalue_G <dbl>,
## #   GMM_cov_type_G <chr>, GMM_n_clust_G <dbl>, cluster_counts_G <chr>,
## #   Logit_LOR_G <dbl>, c1_mean_intensity_G <dbl>, c2_mean_intensity_G <dbl>,
## #   c1_median_intensity_G <dbl>, c2_median_intensity_G <dbl>,
## #   c1_sd_intensity_G <dbl>, c2_sd_intensity_G <dbl>, c1_mean_dwell_G <dbl>,
## #   c2_mean_dwell_G <dbl>, c1_median_dwell_G <dbl>, c2_median_dwell_G <dbl>, …
# Number of retained positions per transcript.
# The regex keeps k-mers containing a 'C' with at least two characters on each
# side; for the 5-character k-mers shown in the preview above this means the
# central base is C (assumes ref_kmer is always a 5-mer -- TODO confirm).
# count() returns one ungrouped row per (transcript_id, transcript_name) pair
# with the tally stored in `num_sites`.
num_m3C_sites <- 
  common_intensity_up_positions |> 
  filter(grepl('.{2}C.{2}', ref_kmer)) |> 
  count(transcript_id, transcript_name, name = 'num_sites')
num_m3C_sites
## # A tibble: 71 × 3
##    transcript_id      transcript_name num_sites
##    <chr>              <chr>               <int>
##  1 ENST00000009589.8  RPS20-201               1
##  2 ENST00000199764.7  CEACAM6-201             1
##  3 ENST00000202773.14 RPL6-201                2
##  4 ENST00000215754.8  MIF-201                 4
##  5 ENST00000229239.10 GAPDH-201               2
##  6 ENST00000230050.4  RPS12-201               4
##  7 ENST00000233143.6  TMSB10-201             15
##  8 ENST00000234875.9  RPL22-201               2
##  9 ENST00000243997.8  ATP5F1E-201             3
## 10 ENST00000254810.8  H3-3B-201               1
## # ℹ 61 more rows
# Attach the per-transcript site count to every row of the CPM table.
#
# * The join keys are spelled out so the join cannot silently change if the two
#   tables ever gain another column with a shared name.
# * relationship = 'many-to-one' makes dplyr raise an error if num_m3C_sites
#   contains duplicated keys, which would otherwise multiply CPM rows.
# * Transcripts without any counted site get num_sites = 0.
# * log10_cpm_plus adds a pseudocount of 0.01 so that zero-CPM rows stay finite
#   after the log transform.
expression_num_m3C_sites <- 
  drs_cpm |> 
  left_join(
    num_m3C_sites,
    by = join_by(transcript_id, transcript_name),
    relationship = 'many-to-one'
  ) |> 
  replace_na(list(num_sites = 0L)) |> 
  select(transcript_id, cpm, num_sites, everything()) |> 
  mutate(log10_cpm_plus = log10(cpm + .01))
expression_num_m3C_sites
## # A tibble: 330,453 × 22
##    transcript_id   cpm num_sites transcript_name gene_id type  si      rep count
##    <chr>         <dbl>     <int> <chr>           <chr>   <chr> <chr> <dbl> <dbl>
##  1 ENST00000498… 0             0 CRBN-212        ENSG00… siME… I         1  0   
##  2 ENST00000498… 1.00          0 CRBN-212        ENSG00… siME… I         2  1   
##  3 ENST00000498… 0             0 CRBN-212        ENSG00… siME… I         3  0   
##  4 ENST00000498… 0             0 CRBN-212        ENSG00… siME… G         1  0   
##  5 ENST00000498… 0             0 CRBN-212        ENSG00… siME… G         2  0   
##  6 ENST00000498… 0             0 CRBN-212        ENSG00… siME… G         3  0   
##  7 ENST00000498… 0.370         0 CRBN-212        ENSG00… Cont  D         1  1   
##  8 ENST00000498… 0.294         0 CRBN-212        ENSG00… Cont  D         2  1   
##  9 ENST00000498… 0             0 CRBN-212        ENSG00… Cont  D         3  0   
## 10 ENST00000459… 0.304         0 CRBN-205        ENSG00… siME… I         1  1.08
## # ℹ 330,443 more rows
## # ℹ 13 more variables: total_reads <dbl>, seqname <chr>, source <chr>,
## #   feature <chr>, start <dbl>, end <dbl>, score <chr>, strand <chr>,
## #   frame <chr>, gene_type <chr>, gene_name <chr>, transcript_type <chr>,
## #   log10_cpm_plus <dbl>
# Hex-binned density of expression level versus number of counted sites.
#
# The x aesthetic uses the precomputed log10(CPM + 0.01) column on a linear
# axis. Mapping raw `cpm` onto scale_x_log10() turned every zero-CPM row into
# -Inf, and ggplot2 then dropped those rows as non-finite, so the figure
# silently excluded all unexpressed transcript/sample rows and its axis label
# did not describe the plotted values. With the pseudocount applied, those
# rows appear at x = -2.
# The fill (bin count) is shown on a log10 colour scale.
correlation_expression_num_m3C_sites <- 
  expression_num_m3C_sites |> 
  ggplot(aes(x = log10_cpm_plus, y = num_sites)) +
  geom_hex() +
  scale_fill_viridis_c(trans = 'log10') +
  labs(x = 'log10(CPM + 0.01)', y = 'the number of m3C sites') 
# Save the figure into figdir via the myUtilities helper.
correlation_expression_num_m3C_sites |> 
  ggsave_multiple_formats(
    width = 5, height = 5, fontsize = 7, outdir = figdir
  )